In [5]:
#Configuraciones de Python
import pandas as pd
pd.options.display.max_rows = 10
In [6]:
from IPython.display import Image
Image(filename='/Users/oldemarrodriguez/Google Drive/MDCurso/Datos/logo_python2.png')
Out[6]:

Visualización de Datos

In [7]:
import matplotlib.pyplot as plt
import numpy as np

Ejemplos de grƔficos de lƭneas (graficando funciones)

In [8]:
# Datos del eje X para los siguientes grƔficos
x = np.linspace(0, 10, 100)
print(x)
[ 0.          0.1010101   0.2020202   0.3030303   0.4040404   0.50505051
  0.60606061  0.70707071  0.80808081  0.90909091  1.01010101  1.11111111
  1.21212121  1.31313131  1.41414141  1.51515152  1.61616162  1.71717172
  1.81818182  1.91919192  2.02020202  2.12121212  2.22222222  2.32323232
  2.42424242  2.52525253  2.62626263  2.72727273  2.82828283  2.92929293
  3.03030303  3.13131313  3.23232323  3.33333333  3.43434343  3.53535354
  3.63636364  3.73737374  3.83838384  3.93939394  4.04040404  4.14141414
  4.24242424  4.34343434  4.44444444  4.54545455  4.64646465  4.74747475
  4.84848485  4.94949495  5.05050505  5.15151515  5.25252525  5.35353535
  5.45454545  5.55555556  5.65656566  5.75757576  5.85858586  5.95959596
  6.06060606  6.16161616  6.26262626  6.36363636  6.46464646  6.56565657
  6.66666667  6.76767677  6.86868687  6.96969697  7.07070707  7.17171717
  7.27272727  7.37373737  7.47474747  7.57575758  7.67676768  7.77777778
  7.87878788  7.97979798  8.08080808  8.18181818  8.28282828  8.38383838
  8.48484848  8.58585859  8.68686869  8.78787879  8.88888889  8.98989899
  9.09090909  9.19191919  9.29292929  9.39393939  9.49494949  9.5959596
  9.6969697   9.7979798   9.8989899  10.        ]

Nota: Lo siguiente se ejecuta todo junto

In [9]:
plt.plot(x, np.sin(x))
plt.plot(x, np.cos(x))
Out[9]:
[<matplotlib.lines.Line2D at 0x11960e8d0>]

Paneles de gráficos

In [10]:
plt.figure()  # crea la figura
# Crea el primer panel
plt.subplot(2, 1, 1) # (filas, columnas, nĆŗmero de paneles)
plt.plot(x, np.sin(x))
# crea el segundo panel
plt.subplot(2, 1, 2)
plt.plot(x, np.cos(x))
Out[10]:
[<matplotlib.lines.Line2D at 0x1199e7e48>]

Un estilo Orientado a Objetos para situaciones mƔs complejas

In [11]:
fig, ax = plt.subplots(2)
# Llama el mƩtodo plot() method 
ax[0].plot(x, np.sin(x))
ax[1].plot(x, np.cos(x))
Out[11]:
[<matplotlib.lines.Line2D at 0x119ae69e8>]

Usando colores

In [12]:
plt.plot(x, np.sin(x - 0), color='blue')        # Nombre del color
plt.plot(x, np.sin(x - 1), color='g')           # Código del color (rgbcmyk)
plt.plot(x, np.sin(x - 2), color='0.75')        # escala de gris entre 0 y 1
plt.plot(x, np.sin(x - 3), color='#FFDD44')     # Código exadecimal (RRGGBB from 00 to FF)
plt.plot(x, np.sin(x - 4), color=(1.0,0.2,0.3)) # Tupla RGB entre 0 y 1
plt.plot(x, np.sin(x - 5), color='chartreuse') # Nombres de color en HTML
plt.plot(x, x + 0, linestyle='solid')
plt.plot(x, x + 1, linestyle='dashed')
plt.plot(x, x + 2, linestyle='dashdot')
plt.plot(x, x + 3, linestyle='dotted')
# Lo mismo pero con código
plt.plot(x, x + 4, linestyle='-')  
plt.plot(x, x + 5, linestyle='--') 
plt.plot(x, x + 6, linestyle='-.') 
plt.plot(x, x + 7, linestyle=':') 
Out[12]:
[<matplotlib.lines.Line2D at 0x119bdd518>]

Cambiando los límites de los ejes

In [13]:
plt.plot(x, np.sin(x))
plt.xlim(-1, 11)
plt.ylim(-1.5, 1.5)
Out[13]:
(-1.5, 1.5)

Títulos

In [14]:
plt.plot(x, np.sin(x))
plt.title("Función Seno(x)")
plt.xlabel("x")
plt.ylabel("Seno(x)") 
Out[14]:
Text(0,0.5,'Seno(x)')

Leyendas

In [15]:
plt.plot(x, np.sin(x), '-g', label='Seno(x)')
plt.plot(x, np.cos(x), ':b', label='Coseno(x)')
plt.axis('equal')
plt.legend()
Out[15]:
<matplotlib.legend.Legend at 0x119e2ba90>

Estilo Orientado a Objetos

In [16]:
ax = plt.axes()
ax.plot(x, np.sin(x))
ax.set(xlim=(0, 10), ylim=(-2, 2),
       xlabel='x', ylabel='Seno(x)',
       title='Un ploteo de Seno(x)')
Out[16]:
[(-2, 2),
 Text(0,0.5,'Seno(x)'),
 (0, 10),
 Text(0.5,0,'x'),
 Text(0.5,1,'Un ploteo de Seno(x)')]

A la izquierda como función de matplotlib

A la derecha como mƩtodo del objeto ax

plt.xlabel() → ax.set_xlabel()

plt.ylabel() → ax.set_ylabel()

plt.xlim() → ax.set_xlim()

plt.ylim() → ax.set_ylim()

plt.title() → ax.set_title()

Gráficos scatter plot = Ejes XY

In [17]:
x = np.linspace(0, 10, 30)
y = np.sin(x)
plt.plot(x, y, 'o', color='black')
Out[17]:
[<matplotlib.lines.Line2D at 0x119d08b00>]

Ejemplo con los datos Iris

In [18]:
from sklearn.datasets import load_iris
iris = load_iris()
print(iris)

caracteristicas = iris.data.T
print(caracteristicas)

plt.scatter(caracteristicas[0], caracteristicas[1], 
            alpha=0.2,s=100*caracteristicas[3], c=iris.target, cmap='viridis')
plt.xlabel(iris.feature_names[0])
plt.ylabel(iris.feature_names[1])
{'data': array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.2],
       [5. , 3.2, 1.2, 0.2],
       [5.5, 3.5, 1.3, 0.2],
       [4.9, 3.6, 1.4, 0.1],
       [4.4, 3. , 1.3, 0.2],
       [5.1, 3.4, 1.5, 0.2],
       [5. , 3.5, 1.3, 0.3],
       [4.5, 2.3, 1.3, 0.3],
       [4.4, 3.2, 1.3, 0.2],
       [5. , 3.5, 1.6, 0.6],
       [5.1, 3.8, 1.9, 0.4],
       [4.8, 3. , 1.4, 0.3],
       [5.1, 3.8, 1.6, 0.2],
       [4.6, 3.2, 1.4, 0.2],
       [5.3, 3.7, 1.5, 0.2],
       [5. , 3.3, 1.4, 0.2],
       [7. , 3.2, 4.7, 1.4],
       [6.4, 3.2, 4.5, 1.5],
       [6.9, 3.1, 4.9, 1.5],
       [5.5, 2.3, 4. , 1.3],
       [6.5, 2.8, 4.6, 1.5],
       [5.7, 2.8, 4.5, 1.3],
       [6.3, 3.3, 4.7, 1.6],
       [4.9, 2.4, 3.3, 1. ],
       [6.6, 2.9, 4.6, 1.3],
       [5.2, 2.7, 3.9, 1.4],
       [5. , 2. , 3.5, 1. ],
       [5.9, 3. , 4.2, 1.5],
       [6. , 2.2, 4. , 1. ],
       [6.1, 2.9, 4.7, 1.4],
       [5.6, 2.9, 3.6, 1.3],
       [6.7, 3.1, 4.4, 1.4],
       [5.6, 3. , 4.5, 1.5],
       [5.8, 2.7, 4.1, 1. ],
       [6.2, 2.2, 4.5, 1.5],
       [5.6, 2.5, 3.9, 1.1],
       [5.9, 3.2, 4.8, 1.8],
       [6.1, 2.8, 4. , 1.3],
       [6.3, 2.5, 4.9, 1.5],
       [6.1, 2.8, 4.7, 1.2],
       [6.4, 2.9, 4.3, 1.3],
       [6.6, 3. , 4.4, 1.4],
       [6.8, 2.8, 4.8, 1.4],
       [6.7, 3. , 5. , 1.7],
       [6. , 2.9, 4.5, 1.5],
       [5.7, 2.6, 3.5, 1. ],
       [5.5, 2.4, 3.8, 1.1],
       [5.5, 2.4, 3.7, 1. ],
       [5.8, 2.7, 3.9, 1.2],
       [6. , 2.7, 5.1, 1.6],
       [5.4, 3. , 4.5, 1.5],
       [6. , 3.4, 4.5, 1.6],
       [6.7, 3.1, 4.7, 1.5],
       [6.3, 2.3, 4.4, 1.3],
       [5.6, 3. , 4.1, 1.3],
       [5.5, 2.5, 4. , 1.3],
       [5.5, 2.6, 4.4, 1.2],
       [6.1, 3. , 4.6, 1.4],
       [5.8, 2.6, 4. , 1.2],
       [5. , 2.3, 3.3, 1. ],
       [5.6, 2.7, 4.2, 1.3],
       [5.7, 3. , 4.2, 1.2],
       [5.7, 2.9, 4.2, 1.3],
       [6.2, 2.9, 4.3, 1.3],
       [5.1, 2.5, 3. , 1.1],
       [5.7, 2.8, 4.1, 1.3],
       [6.3, 3.3, 6. , 2.5],
       [5.8, 2.7, 5.1, 1.9],
       [7.1, 3. , 5.9, 2.1],
       [6.3, 2.9, 5.6, 1.8],
       [6.5, 3. , 5.8, 2.2],
       [7.6, 3. , 6.6, 2.1],
       [4.9, 2.5, 4.5, 1.7],
       [7.3, 2.9, 6.3, 1.8],
       [6.7, 2.5, 5.8, 1.8],
       [7.2, 3.6, 6.1, 2.5],
       [6.5, 3.2, 5.1, 2. ],
       [6.4, 2.7, 5.3, 1.9],
       [6.8, 3. , 5.5, 2.1],
       [5.7, 2.5, 5. , 2. ],
       [5.8, 2.8, 5.1, 2.4],
       [6.4, 3.2, 5.3, 2.3],
       [6.5, 3. , 5.5, 1.8],
       [7.7, 3.8, 6.7, 2.2],
       [7.7, 2.6, 6.9, 2.3],
       [6. , 2.2, 5. , 1.5],
       [6.9, 3.2, 5.7, 2.3],
       [5.6, 2.8, 4.9, 2. ],
       [7.7, 2.8, 6.7, 2. ],
       [6.3, 2.7, 4.9, 1.8],
       [6.7, 3.3, 5.7, 2.1],
       [7.2, 3.2, 6. , 1.8],
       [6.2, 2.8, 4.8, 1.8],
       [6.1, 3. , 4.9, 1.8],
       [6.4, 2.8, 5.6, 2.1],
       [7.2, 3. , 5.8, 1.6],
       [7.4, 2.8, 6.1, 1.9],
       [7.9, 3.8, 6.4, 2. ],
       [6.4, 2.8, 5.6, 2.2],
       [6.3, 2.8, 5.1, 1.5],
       [6.1, 2.6, 5.6, 1.4],
       [7.7, 3. , 6.1, 2.3],
       [6.3, 3.4, 5.6, 2.4],
       [6.4, 3.1, 5.5, 1.8],
       [6. , 3. , 4.8, 1.8],
       [6.9, 3.1, 5.4, 2.1],
       [6.7, 3.1, 5.6, 2.4],
       [6.9, 3.1, 5.1, 2.3],
       [5.8, 2.7, 5.1, 1.9],
       [6.8, 3.2, 5.9, 2.3],
       [6.7, 3.3, 5.7, 2.5],
       [6.7, 3. , 5.2, 2.3],
       [6.3, 2.5, 5. , 1.9],
       [6.5, 3. , 5.2, 2. ],
       [6.2, 3.4, 5.4, 2.3],
       [5.9, 3. , 5.1, 1.8]]), 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]), 'target_names': array(['setosa', 'versicolor', 'virginica'], dtype='<U10'), 'DESCR': '.. _iris_dataset:\n\nIris plants dataset\n--------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 150 (50 in each of three classes)\n    :Number of Attributes: 4 numeric, predictive attributes and the class\n    :Attribute Information:\n        - sepal length in cm\n        - sepal width in cm\n        - petal length in cm\n        - petal width in cm\n        - class:\n                - Iris-Setosa\n                - Iris-Versicolour\n                - Iris-Virginica\n                \n    :Summary Statistics:\n\n    ============== ==== ==== ======= ===== ====================\n                    Min  Max   Mean    SD   Class Correlation\n    ============== ==== ==== ======= ===== ====================\n    sepal length:   4.3  7.9   5.84   0.83    0.7826\n    sepal width:    2.0  4.4   3.05   0.43   -0.4194\n    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)\n    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)\n    ============== ==== ==== ======= ===== ====================\n\n    :Missing Attribute Values: None\n    :Class Distribution: 33.3% for each of 3 classes.\n    :Creator: R.A. Fisher\n    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)\n    :Date: July, 1988\n\nThe famous Iris database, first used by Sir R.A. Fisher. The dataset is taken\nfrom Fisher\'s paper. Note that it\'s the same as in R, but not as in the UCI\nMachine Learning Repository, which has two wrong data points.\n\nThis is perhaps the best known database to be found in the\npattern recognition literature.  Fisher\'s paper is a classic in the field and\nis referenced frequently to this day.  (See Duda & Hart, for example.)  The\ndata set contains 3 classes of 50 instances each, where each class refers to a\ntype of iris plant.  One class is linearly separable from the other 2; the\nlatter are NOT linearly separable from each other.\n\n.. topic:: References\n\n   - Fisher, R.A. "The use of multiple measurements in taxonomic problems"\n     Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to\n     Mathematical Statistics" (John Wiley, NY, 1950).\n   - Duda, R.O., & Hart, P.E. (1973) Pattern Classification and Scene Analysis.\n     (Q327.D83) John Wiley & Sons.  ISBN 0-471-22361-1.  See page 218.\n   - Dasarathy, B.V. (1980) "Nosing Around the Neighborhood: A New System\n     Structure and Classification Rule for Recognition in Partially Exposed\n     Environments".  IEEE Transactions on Pattern Analysis and Machine\n     Intelligence, Vol. PAMI-2, No. 1, 67-71.\n   - Gates, G.W. (1972) "The Reduced Nearest Neighbor Rule".  IEEE Transactions\n     on Information Theory, May 1972, 431-433.\n   - See also: 1988 MLC Proceedings, 54-64.  Cheeseman et al"s AUTOCLASS II\n     conceptual clustering system finds 3 classes in the data.\n   - Many, many more ...', 'feature_names': ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], 'filename': '/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/sklearn/datasets/data/iris.csv'}
[[5.1 4.9 4.7 4.6 5.  5.4 4.6 5.  4.4 4.9 5.4 4.8 4.8 4.3 5.8 5.7 5.4 5.1
  5.7 5.1 5.4 5.1 4.6 5.1 4.8 5.  5.  5.2 5.2 4.7 4.8 5.4 5.2 5.5 4.9 5.
  5.5 4.9 4.4 5.1 5.  4.5 4.4 5.  5.1 4.8 5.1 4.6 5.3 5.  7.  6.4 6.9 5.5
  6.5 5.7 6.3 4.9 6.6 5.2 5.  5.9 6.  6.1 5.6 6.7 5.6 5.8 6.2 5.6 5.9 6.1
  6.3 6.1 6.4 6.6 6.8 6.7 6.  5.7 5.5 5.5 5.8 6.  5.4 6.  6.7 6.3 5.6 5.5
  5.5 6.1 5.8 5.  5.6 5.7 5.7 6.2 5.1 5.7 6.3 5.8 7.1 6.3 6.5 7.6 4.9 7.3
  6.7 7.2 6.5 6.4 6.8 5.7 5.8 6.4 6.5 7.7 7.7 6.  6.9 5.6 7.7 6.3 6.7 7.2
  6.2 6.1 6.4 7.2 7.4 7.9 6.4 6.3 6.1 7.7 6.3 6.4 6.  6.9 6.7 6.9 5.8 6.8
  6.7 6.7 6.3 6.5 6.2 5.9]
 [3.5 3.  3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 3.7 3.4 3.  3.  4.  4.4 3.9 3.5
  3.8 3.8 3.4 3.7 3.6 3.3 3.4 3.  3.4 3.5 3.4 3.2 3.1 3.4 4.1 4.2 3.1 3.2
  3.5 3.6 3.  3.4 3.5 2.3 3.2 3.5 3.8 3.  3.8 3.2 3.7 3.3 3.2 3.2 3.1 2.3
  2.8 2.8 3.3 2.4 2.9 2.7 2.  3.  2.2 2.9 2.9 3.1 3.  2.7 2.2 2.5 3.2 2.8
  2.5 2.8 2.9 3.  2.8 3.  2.9 2.6 2.4 2.4 2.7 2.7 3.  3.4 3.1 2.3 3.  2.5
  2.6 3.  2.6 2.3 2.7 3.  2.9 2.9 2.5 2.8 3.3 2.7 3.  2.9 3.  3.  2.5 2.9
  2.5 3.6 3.2 2.7 3.  2.5 2.8 3.2 3.  3.8 2.6 2.2 3.2 2.8 2.8 2.7 3.3 3.2
  2.8 3.  2.8 3.  2.8 3.8 2.8 2.8 2.6 3.  3.4 3.1 3.  3.1 3.1 3.1 2.7 3.2
  3.3 3.  2.5 3.  3.4 3. ]
 [1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 1.5 1.6 1.4 1.1 1.2 1.5 1.3 1.4
  1.7 1.5 1.7 1.5 1.  1.7 1.9 1.6 1.6 1.5 1.4 1.6 1.6 1.5 1.5 1.4 1.5 1.2
  1.3 1.4 1.3 1.5 1.3 1.3 1.3 1.6 1.9 1.4 1.6 1.4 1.5 1.4 4.7 4.5 4.9 4.
  4.6 4.5 4.7 3.3 4.6 3.9 3.5 4.2 4.  4.7 3.6 4.4 4.5 4.1 4.5 3.9 4.8 4.
  4.9 4.7 4.3 4.4 4.8 5.  4.5 3.5 3.8 3.7 3.9 5.1 4.5 4.5 4.7 4.4 4.1 4.
  4.4 4.6 4.  3.3 4.2 4.2 4.2 4.3 3.  4.1 6.  5.1 5.9 5.6 5.8 6.6 4.5 6.3
  5.8 6.1 5.1 5.3 5.5 5.  5.1 5.3 5.5 6.7 6.9 5.  5.7 4.9 6.7 4.9 5.7 6.
  4.8 4.9 5.6 5.8 6.1 6.4 5.6 5.1 5.6 6.1 5.6 5.5 4.8 5.4 5.6 5.1 5.1 5.9
  5.7 5.2 5.  5.2 5.4 5.1]
 [0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 0.2 0.2 0.1 0.1 0.2 0.4 0.4 0.3
  0.3 0.3 0.2 0.4 0.2 0.5 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.4 0.1 0.2 0.2 0.2
  0.2 0.1 0.2 0.2 0.3 0.3 0.2 0.6 0.4 0.3 0.2 0.2 0.2 0.2 1.4 1.5 1.5 1.3
  1.5 1.3 1.6 1.  1.3 1.4 1.  1.5 1.  1.4 1.3 1.4 1.5 1.  1.5 1.1 1.8 1.3
  1.5 1.2 1.3 1.4 1.4 1.7 1.5 1.  1.1 1.  1.2 1.6 1.5 1.6 1.5 1.3 1.3 1.3
  1.2 1.4 1.2 1.  1.3 1.2 1.3 1.3 1.1 1.3 2.5 1.9 2.1 1.8 2.2 2.1 1.7 1.8
  1.8 2.5 2.  1.9 2.1 2.  2.4 2.3 1.8 2.2 2.3 1.5 2.3 2.  2.  1.8 2.1 1.8
  1.8 1.8 2.1 1.6 1.9 2.  2.2 1.5 1.4 2.3 2.4 1.8 1.8 2.1 2.4 2.3 1.9 2.3
  2.5 2.3 1.9 2.  2.3 1.8]]
Out[18]:
Text(0,0.5,'sepal width (cm)')

Ploteando Histogramas y y Funciones de Densidad

In [19]:
plt.style.use('seaborn-white')
datos = np.random.randn(1000)
plt.hist(datos)
Out[19]:
(array([  4.,  29.,  98., 189., 243., 227., 148.,  49.,  11.,   2.]),
 array([-3.04843601, -2.39562373, -1.74281145, -1.08999917, -0.4371869 ,
         0.21562538,  0.86843766,  1.52124994,  2.17406221,  2.82687449,
         3.47968677]),
 <a list of 10 Patch objects>)
In [20]:
plt.hist(datos, bins=30, normed=True, alpha=0.5,
         histtype='stepfilled', color='steelblue',
         edgecolor='none')
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/matplotlib/axes/_axes.py:6571: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[20]:
(array([0.0045955 , 0.        , 0.01378651, 0.02757301, 0.04135952,
        0.06433703, 0.12407855, 0.15165156, 0.17462907, 0.21139308,
        0.28492111, 0.37223565, 0.32628063, 0.38602215, 0.40440416,
        0.41819066, 0.34466264, 0.28032561, 0.22977509, 0.28492111,
        0.16543806, 0.12407855, 0.05514602, 0.04595502, 0.04135952,
        0.0045955 , 0.0045955 , 0.        , 0.0045955 , 0.0045955 ]),
 array([-3.04843601e+00, -2.83083192e+00, -2.61322782e+00, -2.39562373e+00,
        -2.17801964e+00, -1.96041554e+00, -1.74281145e+00, -1.52520736e+00,
        -1.30760327e+00, -1.08999917e+00, -8.72395082e-01, -6.54790990e-01,
        -4.37186897e-01, -2.19582805e-01, -1.97871198e-03,  2.15625381e-01,
         4.33229473e-01,  6.50833566e-01,  8.68437658e-01,  1.08604175e+00,
         1.30364584e+00,  1.52124994e+00,  1.73885403e+00,  1.95645812e+00,
         2.17406221e+00,  2.39166631e+00,  2.60927040e+00,  2.82687449e+00,
         3.04447858e+00,  3.26208268e+00,  3.47968677e+00]),
 <a list of 1 Patch objects>)

Tests de normalidad en Python

Para pruebas de normalidad siempre se plantean así las hipótesis.

Hipótesis:

H0: La muestra proviene de una distribución normal.

H1: La muestra no proviene de una distribución normal.

Nivel de Significancia: El nivel de significancia que se trabajarĆ” es de 0.05. Alpha=0.05

Criterio de Decisión

Si P < Alpha Se rechaza H0

Si p >= Alpha No se rechaza H0, es decir, los datos Sƍ siguen la normal

Test de Shapiro-Wilk

In [21]:
import scipy.stats
datos = np.random.randn(1000)
shapiro_resultados = scipy.stats.shapiro(datos)
print(shapiro_resultados)
p_value = shapiro_resultados[1]
print(p_value)
# interpretación
alpha = 0.05
if p_value > alpha:
	print('SĆ­ sigue la curva Normal (No se rechaza H0)')
else:
	print('No sigue la curva Normal (Se rechaza H0)')
(0.9990536570549011, 0.8993759155273438)
0.8993759155273438
SĆ­ sigue la curva Normal (No se rechaza H0)

Forma GrƔfica: Si los puntos se aproximan a la recta significa que los datos sƭ siguen la normal.

El Código es:

from statsmodels.graphics.gofplots import qqplot

from matplotlib import pyplot

qqplot(datos, line='s')

In [22]:
# QQ Plot
from statsmodels.graphics.gofplots import qqplot
from matplotlib import pyplot
# q-q plot
qqplot(datos, line='s')
Out[22]:

Test de Kolmogorov-Smirnov

In [23]:
ks_resultados = scipy.stats.kstest(datos, cdf='norm')
print(ks_resultados)
p_value = ks_resultados[1]
print(p_value)
# interpretación
alpha = 0.05
if p_value > alpha:
	print('SĆ­ sigue la curva Normal (No se rechaza H0)')
else:
	print('No sigue la curva Normal (Se rechaza H0)')
KstestResult(statistic=0.028227678990131677, pvalue=0.39883582874041457)
0.39883582874041457
SĆ­ sigue la curva Normal (No se rechaza H0)

Ejemplo Dígitos, los vamos a usar más adelante en el curso

In [24]:
from sklearn.datasets import load_digits
digitos = load_digits(n_class=6)
print(digitos)
{'data': array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  0., ...,  9.,  0.,  0.],
       [ 0.,  0.,  0., ...,  4.,  0.,  0.],
       [ 0.,  0.,  6., ...,  6.,  0.,  0.]]), 'target': array([0, 1, 2, ..., 4, 4, 0]), 'target_names': array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), 'images': array([[[ 0.,  0.,  5., ...,  1.,  0.,  0.],
        [ 0.,  0., 13., ..., 15.,  5.,  0.],
        [ 0.,  3., 15., ..., 11.,  8.,  0.],
        ...,
        [ 0.,  4., 11., ..., 12.,  7.,  0.],
        [ 0.,  2., 14., ..., 12.,  0.,  0.],
        [ 0.,  0.,  6., ...,  0.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  5.,  0.,  0.],
        [ 0.,  0.,  0., ...,  9.,  0.,  0.],
        [ 0.,  0.,  3., ...,  6.,  0.,  0.],
        ...,
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  1., ...,  6.,  0.,  0.],
        [ 0.,  0.,  0., ..., 10.,  0.,  0.]],

       [[ 0.,  0.,  0., ..., 12.,  0.,  0.],
        [ 0.,  0.,  3., ..., 14.,  0.,  0.],
        [ 0.,  0.,  8., ..., 16.,  0.,  0.],
        ...,
        [ 0.,  9., 16., ...,  0.,  0.,  0.],
        [ 0.,  3., 13., ..., 11.,  5.,  0.],
        [ 0.,  0.,  0., ..., 16.,  9.,  0.]],

       ...,

       [[ 0.,  0.,  0., ...,  6.,  0.,  0.],
        [ 0.,  0.,  0., ...,  2.,  0.,  0.],
        [ 0.,  0.,  8., ...,  1.,  2.,  0.],
        ...,
        [ 0., 12., 16., ..., 16.,  1.,  0.],
        [ 0.,  1.,  7., ..., 13.,  0.,  0.],
        [ 0.,  0.,  0., ...,  9.,  0.,  0.]],

       [[ 0.,  0.,  0., ...,  4.,  0.,  0.],
        [ 0.,  0.,  4., ...,  0.,  0.,  0.],
        [ 0.,  0., 12., ...,  4.,  3.,  0.],
        ...,
        [ 0., 12., 16., ..., 13.,  0.,  0.],
        [ 0.,  0.,  4., ...,  8.,  0.,  0.],
        [ 0.,  0.,  0., ...,  4.,  0.,  0.]],

       [[ 0.,  0.,  6., ..., 11.,  1.,  0.],
        [ 0.,  0., 16., ..., 16.,  1.,  0.],
        [ 0.,  3., 16., ..., 13.,  6.,  0.],
        ...,
        [ 0.,  5., 16., ..., 16.,  5.,  0.],
        [ 0.,  1., 15., ..., 16.,  1.,  0.],
        [ 0.,  0.,  6., ...,  6.,  0.,  0.]]]), 'DESCR': ".. _digits_dataset:\n\nOptical recognition of handwritten digits dataset\n--------------------------------------------------\n\n**Data Set Characteristics:**\n\n    :Number of Instances: 5620\n    :Number of Attributes: 64\n    :Attribute Information: 8x8 image of integer pixels in the range 0..16.\n    :Missing Attribute Values: None\n    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)\n    :Date: July; 1998\n\nThis is a copy of the test set of the UCI ML hand-written digits datasets\nhttp://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits\n\nThe data set contains images of hand-written digits: 10 classes where\neach class refers to a digit.\n\nPreprocessing programs made available by NIST were used to extract\nnormalized bitmaps of handwritten digits from a preprinted form. From a\ntotal of 43 people, 30 contributed to the training set and different 13\nto the test set. 32x32 bitmaps are divided into nonoverlapping blocks of\n4x4 and the number of on pixels are counted in each block. This generates\nan input matrix of 8x8 where each element is an integer in the range\n0..16. This reduces dimensionality and gives invariance to small\ndistortions.\n\nFor info on NIST preprocessing routines, see M. D. Garris, J. L. Blue, G.\nT. Candela, D. L. Dimmick, J. Geist, P. J. Grother, S. A. Janet, and C.\nL. Wilson, NIST Form-Based Handprint Recognition System, NISTIR 5469,\n1994.\n\n.. topic:: References\n\n  - C. Kaynak (1995) Methods of Combining Multiple Classifiers and Their\n    Applications to Handwritten Digit Recognition, MSc Thesis, Institute of\n    Graduate Studies in Science and Engineering, Bogazici University.\n  - E. Alpaydin, C. Kaynak (1998) Cascading Classifiers, Kybernetika.\n  - Ken Tang and Ponnuthurai N. Suganthan and Xi Yao and A. Kai Qin.\n    Linear dimensionalityreduction using relevance weighted LDA. School of\n    Electrical and Electronic Engineering Nanyang Technological University.\n    2005.\n  - Claudio Gentile. A New Approximate Maximal Margin Classification\n    Algorithm. NIPS. 2000."}
In [25]:
fig, ax = plt.subplots(8, 8, figsize=(6, 6))
for i, axi in enumerate(ax.flat):
    axi.imshow(digitos.images[i], cmap='binary')
    axi.set(xticks=[], yticks=[])

Gráficos 3D

In [26]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D # Para una diferente version de matplotlib

fig = plt.figure()
ax = Axes3D(fig)

Ejemplos

In [27]:
fig = plt.figure()
ax = Axes3D(fig)

# Datos para la lĆ­nea en 3D
zline = np.linspace(0, 15, 1000)
xline = np.sin(zline)
yline = np.cos(zline)
ax.plot3D(xline, yline, zline, 'gray')
Out[27]:
[<mpl_toolkits.mplot3d.art3d.Line3D at 0x1a1d75b828>]
In [28]:
fig = plt.figure()
ax = Axes3D(fig)
zdata = 15 * np.random.random(100)
xdata = np.sin(zdata) + 0.1 * np.random.randn(100)
ydata = np.cos(zdata) + 0.1 * np.random.randn(100)
ax.scatter3D(xdata, ydata, zdata, c=zdata, cmap='Greens')
Out[28]:
<mpl_toolkits.mplot3d.art3d.Path3DCollection at 0x1a1d8a3860>

 Gráfico de pares con el el paquete Seaborn

In [29]:
import seaborn as sns
iris = sns.load_dataset("iris")
print(iris.head())
corr = sns.pairplot(iris, hue='species', size=2.5)
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
  warnings.warn(msg, UserWarning)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

Análisis Exploratorio Básico

Paso 1: Cargar la tabla de datos

In [30]:
import pandas as pd
import prince
import os
import pandas as pd
import numpy as np
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
print(os.getcwd())
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.head())
print(datos.shape)
/Users/oldemarrodriguez/Google Drive/MDCurso/Datos
   sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
(462, 10)

Paso 2: Presentación de estadísticas básicas

describe() es como el summary de R para las variables numƩricas

In [31]:
print(datos.dropna().describe())
print(datos.describe())
print(datos.mean(numeric_only=True))
print(datos.median(numeric_only=True))
print(datos.std(numeric_only=True))
print(datos.max(numeric_only=True))
              sbp     tobacco         ldl   adiposity       typea     obesity  \
count  462.000000  462.000000  462.000000  462.000000  462.000000  462.000000   
mean   138.326840    3.635649    4.740325   25.406732   53.103896   26.044113   
std     20.496317    4.593024    2.070909    7.780699    9.817534    4.213680   
min    101.000000    0.000000    0.980000    6.740000   13.000000   14.700000   
25%    124.000000    0.052500    3.282500   19.775000   47.000000   22.985000   
50%    134.000000    2.000000    4.340000   26.115000   53.000000   25.805000   
75%    148.000000    5.500000    5.790000   31.227500   60.000000   28.497500   
max    218.000000   31.200000   15.330000   42.490000   78.000000   46.580000   

          alcohol         age  
count  462.000000  462.000000  
mean    17.044394   42.816017  
std     24.481059   14.608956  
min      0.000000   15.000000  
25%      0.510000   31.000000  
50%      7.510000   45.000000  
75%     23.892500   55.000000  
max    147.190000   64.000000  
              sbp     tobacco         ldl   adiposity       typea     obesity  \
count  462.000000  462.000000  462.000000  462.000000  462.000000  462.000000   
mean   138.326840    3.635649    4.740325   25.406732   53.103896   26.044113   
std     20.496317    4.593024    2.070909    7.780699    9.817534    4.213680   
min    101.000000    0.000000    0.980000    6.740000   13.000000   14.700000   
25%    124.000000    0.052500    3.282500   19.775000   47.000000   22.985000   
50%    134.000000    2.000000    4.340000   26.115000   53.000000   25.805000   
75%    148.000000    5.500000    5.790000   31.227500   60.000000   28.497500   
max    218.000000   31.200000   15.330000   42.490000   78.000000   46.580000   

          alcohol         age  
count  462.000000  462.000000  
mean    17.044394   42.816017  
std     24.481059   14.608956  
min      0.000000   15.000000  
25%      0.510000   31.000000  
50%      7.510000   45.000000  
75%     23.892500   55.000000  
max    147.190000   64.000000  
sbp          138.326840
tobacco        3.635649
ldl            4.740325
adiposity     25.406732
typea         53.103896
obesity       26.044113
alcohol       17.044394
age           42.816017
dtype: float64
sbp          134.000
tobacco        2.000
ldl            4.340
adiposity     26.115
typea         53.000
obesity       25.805
alcohol        7.510
age           45.000
dtype: float64
sbp          20.496317
tobacco       4.593024
ldl           2.070909
adiposity     7.780699
typea         9.817534
obesity       4.213680
alcohol      24.481059
age          14.608956
dtype: float64
sbp          218.00
tobacco       31.20
ldl           15.33
adiposity     42.49
typea         78.00
obesity       46.58
alcohol      147.19
age           64.00
dtype: float64

Los percentiles

In [32]:
print(datos.quantile(np.array([0,.25,.50,.75,1])))
        sbp  tobacco      ldl  adiposity  typea  obesity   alcohol   age
0.00  101.0   0.0000   0.9800     6.7400   13.0  14.7000    0.0000  15.0
0.25  124.0   0.0525   3.2825    19.7750   47.0  22.9850    0.5100  31.0
0.50  134.0   2.0000   4.3400    26.1150   53.0  25.8050    7.5100  45.0
0.75  148.0   5.5000   5.7900    31.2275   60.0  28.4975   23.8925  55.0
1.00  218.0  31.2000  15.3300    42.4900   78.0  46.5800  147.1900  64.0

Contando datos en las variables categóricas

In [33]:
print(pd.crosstab(index=datos["chd"],columns="count"))
print(pd.crosstab(index=datos["famhist"],columns="count"))
col_0  count
chd         
No       302
Si       160
col_0    count
famhist       
Absent     270
Present    192

Otra forma

In [34]:
print(datos['chd'].value_counts())
print(datos["famhist"].value_counts())
No    302
Si    160
Name: chd, dtype: int64
Absent     270
Present    192
Name: famhist, dtype: int64

Tablas cruzadas

In [35]:
famhist_chd = pd.crosstab(index=datos["famhist"], columns=datos["chd"])
print(famhist_chd)
famhist_chd.index = ["Absent","Present"]
print(famhist_chd)
chd       No  Si
famhist         
Absent   206  64
Present   96  96
chd       No  Si
Absent   206  64
Present   96  96

Otra forma

In [36]:
g_chd = pd.crosstab(index=datos["chd"],columns="count") 
print(g_chd) 
print(g_chd['count'][0])
print(g_chd['count'][1])
g_famhist = pd.crosstab(index=datos["famhist"],columns="count") 
print(g_famhist)
print(g_famhist['count'][0])
print(g_famhist['count'][1])
col_0  count
chd         
No       302
Si       160
302
160
col_0    count
famhist       
Absent     270
Present    192
270
192

Paso 3: Gráficos importantes

GrÔfico de la distribución de la variable chd

In [37]:
import matplotlib.pyplot as plt
alto = [g_chd['count'][0], g_chd['count'][1]]
barras = ('No', 'SĆ­')
y_pos = np.arange(len(barras))
plt.bar(y_pos, alto, color=['red','blue'])
plt.xticks(y_pos, barras)
Out[37]:
([<matplotlib.axis.XTick at 0x1a1ef4e390>,
  <matplotlib.axis.XTick at 0x1a1ef50c18>],
 <a list of 2 Text xticklabel objects>)

GrÔfico de la distribución de la variable famhist

In [38]:
alto = [g_famhist['count'][0], g_famhist['count'][1]]
barras = ('Absent ', 'Present')
y_pos = np.arange(len(barras))
plt.bar(y_pos, alto, color=['red','blue'])
plt.xticks(y_pos, barras)
Out[38]:
([<matplotlib.axis.XTick at 0x1a1efbb208>,
  <matplotlib.axis.XTick at 0x1a1efb8ac8>],
 <a list of 2 Text xticklabel objects>)

Box Plots

In [39]:
datos.head()
boxplots = datos.boxplot(return_type='axes')

Funciones de densidad

In [40]:
densidad = datos[datos.columns[:1]].plot(kind='density')
densidad = datos[datos.columns[8:9]].plot(kind='density')
densidad = datos['age'].plot(kind='density')
densidad = datos[datos.columns[:10]].plot(kind='density')

Histogramas

In [41]:
densidad = datos[datos.columns[:1]].plot(kind='hist')
densidad = datos[datos.columns[8:9]].plot(kind='hist')
densidad = datos['age'].plot(kind='hist')
densidad = datos[datos.columns[:10]].plot(kind='hist')

GrƔfico de pares de todas las variables 2 a 2

In [42]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.pairplot(datos, hue='chd', size=2.5)
sns.pairplot(datos, hue='famhist', size=2.5)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
  warnings.warn(msg, UserWarning)
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval
/Users/oldemarrodriguez/anaconda3/lib/python3.7/site-packages/seaborn/axisgrid.py:2065: UserWarning: The `size` parameter has been renamed to `height`; pleaes update your code.
  warnings.warn(msg, UserWarning)
Out[42]:
<seaborn.axisgrid.PairGrid at 0x1a1f58dd68>

Calculando y graficando correlaciones

Nota: Es "inteligente" e ingnora las variables categóricas

In [43]:
corr = datos.corr()
print(corr)
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=np.bool), cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)
                sbp   tobacco       ldl  adiposity     typea   obesity  \
sbp        1.000000  0.212247  0.158296   0.356500 -0.057454  0.238067   
tobacco    0.212247  1.000000  0.158905   0.286640 -0.014608  0.124529   
ldl        0.158296  0.158905  1.000000   0.440432  0.044048  0.330506   
adiposity  0.356500  0.286640  0.440432   1.000000 -0.043144  0.716556   
typea     -0.057454 -0.014608  0.044048  -0.043144  1.000000  0.074006   
obesity    0.238067  0.124529  0.330506   0.716556  0.074006  1.000000   
alcohol    0.140096  0.200813 -0.033403   0.100330  0.039498  0.051620   
age        0.388771  0.450330  0.311799   0.625954 -0.102606  0.291777   

            alcohol       age  
sbp        0.140096  0.388771  
tobacco    0.200813  0.450330  
ldl       -0.033403  0.311799  
adiposity  0.100330  0.625954  
typea      0.039498 -0.102606  
obesity    0.051620  0.291777  
alcohol    1.000000  0.101125  
age        0.101125  1.000000  
Out[43]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a23420630>

Contando la cantidad en cada categoría

In [44]:
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
#print(os.getcwd())
datos = pd.read_csv('SAheart.csv',delimiter=';',decimal=".")
print(datos.head())

# Contando la cantidad en cada categorĆ­a
print(pd.value_counts(datos["chd"]))

# Es equivalente
print(datos['chd'].value_counts())
   sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si
No    302
Si    160
Name: chd, dtype: int64
No    302
Si    160
Name: chd, dtype: int64

Recodificando variables

La siguiente función recodifica usando pandas una categoría con números

Nota: Esto NO convierte la variable en numƩrica.

In [45]:
def recodificar(col, nuevo_codigo):
  col_cod = pd.Series(col, copy=True)
  for llave, valor in nuevo_codigo.items():
    col_cod.replace(llave, valor, inplace=True)
  return col_cod

Ejemplo

In [46]:
datos["chd"] = recodificar(datos["chd"], {'No':0,'Si':1})
print(datos.head())

# Conteo
print(pd.value_counts(datos["chd"]))
# Es equivalente
print(datos['chd'].value_counts())
   sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age  chd
0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52    1
1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63    1
2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46    0
3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58    1
4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49    1
0    302
1    160
Name: chd, dtype: int64
0    302
1    160
Name: chd, dtype: int64

A la inversa: Conviertiendo un nĆŗmero en una categorĆ­a

In [47]:
datos["chd"] = recodificar(datos["chd"], {0:'No',1:'Si'})
print(datos.head())
   sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age chd
0  160    12.00  5.73      23.11  Present     49    25.30    97.20   52  Si
1  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63  Si
2  118     0.08  3.48      32.28  Present     52    29.14     3.81   46  No
3  170     7.50  6.41      38.03  Present     51    31.99    24.26   58  Si
4  134    13.60  3.50      27.78  Present     60    25.99    57.34   49  Si

Análisis en Componentes Principales - ACP

Ejemplo 1

In [48]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

from sklearn.datasets import load_digits
digits = load_digits()
print(digits.data.shape)

pca = PCA(2)  # Reduce las dimensiones a 2
componentes = pca.fit_transform(digits.data)
print(digits.data.shape)
print(componentes.shape)

plt.scatter(componentes[:, 0], componentes[:, 1],
            c=digits.target, edgecolor='none', alpha=0.5,
            cmap=plt.cm.get_cmap('viridis', 10))
plt.xlabel('componente 1')
plt.ylabel('componente 2')
plt.colorbar()
(1797, 64)
(1797, 64)
(1797, 2)
Out[48]:
<matplotlib.colorbar.Colorbar at 0x1a20ccef98>

Ejemplo 2

In [49]:
import os
import pandas as pd

os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
datos = pd.read_csv('EjemploEstudiantes.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
print(datos.head())
print(datos.shape)

pca = PCA(n_components=2)
componentes = pca.fit_transform(datos)
print(componentes)
print(datos.shape)
print(componentes.shape)

plt.scatter(componentes[:, 0], componentes[:, 1])
plt.xlabel('componente 1')
plt.ylabel('componente 2')
        Matematicas  Ciencias  Espanol  Historia  EdFisica
Lucia           7.0       6.5      9.2       8.6       8.0
Pedro           7.5       9.4      7.3       7.0       7.0
Ines            7.6       9.2      8.0       8.0       7.5
Luis            5.0       6.5      6.5       7.0       9.0
Andres          6.0       6.0      7.8       8.9       7.3
Ana             7.8       9.6      7.7       8.0       6.5
Carlos          6.3       6.4      8.2       9.0       7.2
Jose            7.9       9.7      7.5       8.0       6.0
Sonia           6.0       6.0      6.5       5.5       8.7
Maria           6.8       7.2      8.7       9.0       7.0
        Matematicas  Ciencias  Espanol  Historia  EdFisica
Lucia           7.0       6.5      9.2       8.6       8.0
Pedro           7.5       9.4      7.3       7.0       7.0
Ines            7.6       9.2      8.0       8.0       7.5
Luis            5.0       6.5      6.5       7.0       9.0
Andres          6.0       6.0      7.8       8.9       7.3
(10, 5)
[[-0.76471745 -1.5817637 ]
 [ 1.66887794  1.39196556]
 [ 1.57822841  0.29949595]
 [-2.60701317  1.32020402]
 [-1.43877557 -1.33566867]
 [ 2.34790534  0.3880845 ]
 [-0.89372557 -1.51890124]
 [ 2.64984571  0.4254636 ]
 [-2.62959083  2.18339513]
 [ 0.08896518 -1.57227516]]
(10, 5)
(10, 2)
Out[49]:
Text(0,0.5,'componente 2')

ACP con el paquete "prince"

En Mac (Terminal):

pip install git+https://github.com/MaxHalford/Prince

En Windows (Anaconda Prompt):

pip install git+https://github.com/MaxHalford/Prince

Prince en githup:

Instalando paquetes

https://docs.python.org/3/installing/

Clase ACP propia (utiliza prince para los cálculos)

In [50]:
import matplotlib.pyplot as plt
from prince import PCA

class ACP:
    def __init__(self, datos, n_componentes = 5): 
        self.__datos = datos
        self.__modelo = PCA(n_components = n_componentes).fit(self.__datos)
        self.__correlacion_var = self.__modelo.column_correlations(datos)
        self.__coordenadas_ind = self.__modelo.row_coordinates(datos)
        self.__contribucion_ind = self.__modelo.row_contributions(datos)
        self.__cos2_ind = self.__modelo.row_cosine_similarities(datos)
        self.__var_explicada = [x * 100 for x in self.__modelo.explained_inertia_]
    @property
    def datos(self):
        return self.__datos
    @datos.setter
    def datos(self, datos):
        self.__datos = datos
    @property
    def modelo(self):
        return self.__modelo
    @property
    def correlacion_var(self):
        return self.__correlacion_var
    @property
    def coordenadas_ind(self):
        return self.__coordenadas_ind
    @property
    def contribucion_ind(self):
        return self.__contribucion_ind
    @property
    def cos2_ind(self):
        return self.__cos2_ind
    @property
    def var_explicada(self):
        return self.__var_explicada
        self.__var_explicada = var_explicada
    def plot_plano_principal(self, ejes = [0, 1], ind_labels = True, titulo = 'Plano Principal'):
        x = self.coordenadas_ind[ejes[0]].values
        y = self.coordenadas_ind[ejes[1]].values
        plt.style.use('seaborn-whitegrid')
        plt.scatter(x, y, color = 'gray')
        plt.title(titulo)
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')
        inercia_x = round(self.var_explicada[ejes[0]], 2)
        inercia_y = round(self.var_explicada[ejes[1]], 2)
        plt.xlabel('Componente ' + str(ejes[0]) + ' (' + str(inercia_x) + '%)')
        plt.ylabel('Componente ' + str(ejes[1]) + ' (' + str(inercia_y) + '%)')
        if ind_labels:
            for i, txt in enumerate(self.coordenadas_ind.index):
                plt.annotate(txt, (x[i], y[i]))
    def plot_circulo(self, ejes = [0, 1], var_labels = True, titulo = 'Círculo de Correlación'):
        cor = self.correlacion_var.iloc[:, ejes].values
        plt.style.use('seaborn-whitegrid')
        c = plt.Circle((0, 0), radius = 1, color = 'steelblue', fill = False)
        plt.gca().add_patch(c)
        plt.axis('scaled')
        plt.title(titulo)
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')
        inercia_x = round(self.var_explicada[ejes[0]], 2)
        inercia_y = round(self.var_explicada[ejes[1]], 2)
        plt.xlabel('Componente ' + str(ejes[0]) + ' (' + str(inercia_x) + '%)')
        plt.ylabel('Componente ' + str(ejes[1]) + ' (' + str(inercia_y) + '%)')
        for i in range(cor.shape[0]):
            plt.arrow(0, 0, cor[i, 0] * 0.95, cor[i, 1] * 0.95, color = 'steelblue', 
                      alpha = 0.5, head_width = 0.05, head_length = 0.05)
            if var_labels:
                plt.text(cor[i, 0] * 1.05, cor[i, 1] * 1.05, self.correlacion_var.index[i], 
                         color = 'steelblue', ha = 'center', va = 'center')
    def plot_sobreposicion(self, ejes = [0, 1], ind_labels = True, 
                      var_labels = True, titulo = 'Sobreposición Plano-Círculo'):
        x = self.coordenadas_ind[ejes[0]].values
        y = self.coordenadas_ind[ejes[1]].values
        cor = self.correlacion_var.iloc[:, ejes]
        scale = min((max(x) - min(x)/(max(cor[ejes[0]]) - min(cor[ejes[0]]))), 
                    (max(y) - min(y)/(max(cor[ejes[1]]) - min(cor[ejes[1]])))) * 0.7
        cor = self.correlacion_var.iloc[:, ejes].values
        plt.style.use('seaborn-whitegrid')
        plt.axhline(y = 0, color = 'dimgrey', linestyle = '--')
        plt.axvline(x = 0, color = 'dimgrey', linestyle = '--')
        inercia_x = round(self.var_explicada[ejes[0]], 2)
        inercia_y = round(self.var_explicada[ejes[1]], 2)
        plt.xlabel('Componente ' + str(ejes[0]) + ' (' + str(inercia_x) + '%)')
        plt.ylabel('Componente ' + str(ejes[1]) + ' (' + str(inercia_y) + '%)')
        plt.scatter(x, y, color = 'gray')
        if ind_labels:
            for i, txt in enumerate(self.coordenadas_ind.index):
                plt.annotate(txt, (x[i], y[i]))
        for i in range(cor.shape[0]):
            plt.arrow(0, 0, cor[i, 0] * scale, cor[i, 1] * scale, color = 'steelblue', 
                      alpha = 0.5, head_width = 0.05, head_length = 0.05)
            if var_labels:
                plt.text(cor[i, 0] * scale * 1.15, cor[i, 1] * scale * 1.15, 
                         self.correlacion_var.index[i], 
                         color = 'steelblue', ha = 'center', va = 'center')

Ejemplo 1

In [51]:
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
datos = pd.read_csv('EjemploEstudiantes.csv',delimiter=';',decimal=",",index_col=0)
print(datos)
print(datos.shape)

# Declara la instancia de clase
acp = ACP(datos,n_componentes=3)
# Despliega las Componenentes Principales
print(acp.coordenadas_ind)
# Despliega los cosenos cuadrados de los individuos
print(acp.cos2_ind)
# Despliega las correlaciones de las variables con respecto a las componentes
print(acp.correlacion_var)
        Matematicas  Ciencias  Espanol  Historia  EdFisica
Lucia           7.0       6.5      9.2       8.6       8.0
Pedro           7.5       9.4      7.3       7.0       7.0
Ines            7.6       9.2      8.0       8.0       7.5
Luis            5.0       6.5      6.5       7.0       9.0
Andres          6.0       6.0      7.8       8.9       7.3
Ana             7.8       9.6      7.7       8.0       6.5
Carlos          6.3       6.4      8.2       9.0       7.2
Jose            7.9       9.7      7.5       8.0       6.0
Sonia           6.0       6.0      6.5       5.5       8.7
Maria           6.8       7.2      8.7       9.0       7.0
(10, 5)
               0         1         2
Lucia  -0.323063  1.772525  1.198801
Pedro  -0.665441 -1.638702  0.145476
Ines   -1.002547 -0.515692  0.628888
Luis    3.172095 -0.262782 -0.381960
Andres  0.488868  1.365402 -0.835236
Ana    -1.708633 -1.021700 -0.127077
Carlos -0.067586  1.462336 -0.506240
Jose   -2.011855 -1.275865 -0.542150
Sonia   3.042030 -1.254881  0.448829
Maria  -0.923869  1.369359 -0.029330
               0         1         2
Lucia   0.022285  0.670856  0.306859
Pedro   0.140605  0.852675  0.006720
Ines    0.603107  0.159575  0.237318
Luis    0.979085  0.006719  0.014196
Andres  0.085326  0.665608  0.249066
Ana     0.733626  0.262316  0.004058
Carlos  0.001904  0.891281  0.106815
Jose    0.678061  0.272700  0.049240
Sonia   0.838971  0.142766  0.018263
Maria   0.312702  0.686982  0.000315
                    0         1         2
Ciencias    -0.722798 -0.648395  0.023840
EdFisica     0.913926  0.119637  0.340651
Espanol     -0.610893  0.717321  0.331025
Historia    -0.599923  0.748470 -0.232063
Matematicas -0.895798 -0.345204  0.257979
In [52]:
# Plotea el plano principal
acp.plot_plano_principal()
In [53]:
# Plotea el círculo de correlación
acp.plot_circulo()
In [54]:
# Plotea la sobreposición plano-correlación
acp.plot_sobreposicion()

Graficando en componentes 1 y 3 (Numeradas con 0 y 2 en Python)

In [55]:
# Plotea el plano principal
acp.plot_plano_principal(ejes = [0, 2])
In [56]:
# Plotea el círculo de correlación
acp.plot_circulo(ejes = [0, 2])
In [57]:
# Plotea la sobreposición plkano-correlación
acp.plot_sobreposicion(ejes = [0, 2])

Ejemplo 2

In [58]:
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
iris = pd.read_csv('iris.csv',delimiter=';',decimal=".")
print(iris.head())
print(iris.shape)
iris2 = pd.DataFrame(data=iris, columns=['s.largo', 's.ancho', 'p.largo', 'p.ancho'])
acp = ACP(iris2,n_componentes=4)
   s.largo  s.ancho  p.largo  p.ancho    tipo
0      5.1      3.5      1.4      0.2  setosa
1      4.9      3.0      1.4      0.2  setosa
2      4.7      3.2      1.3      0.2  setosa
3      4.6      3.1      1.5      0.2  setosa
4      5.0      3.6      1.4      0.2  setosa
(150, 5)
In [59]:
# Plotea el plano principal
acp.plot_plano_principal()
In [60]:
# Plotea el círculo de correlación
acp.plot_circulo()
In [61]:
# Plotea la sobreposición plano-correlación
acp.plot_sobreposicion()

Análisis en Componentes Principales con variables categóricas

Códigos Dummy (Códigos Disyuntivos Completos) - pd.get_dummies(Datos)

Ejemplo 3

In [62]:
os.chdir("/Users/oldemarrodriguez/Google Drive/MDCurso/Datos")
datos = pd.read_csv('EjemploEstudiantes_Categoricas.csv',delimiter=';',decimal=",",index_col=0)
print(datos.head())
print(datos.shape)
print(datos.dtypes)

# Recodificando la variable "Conducta" usando texto y luego se convierten a variables Dummy
datos["Conducta"] = recodificar(datos["Conducta"], {1:'Mala',2:'Regular',3:'Buena'})
print(datos.head())
print(datos.dtypes)
# Conviertiendo la variables en Dummy
datos_dummy = pd.get_dummies(datos)
print(datos_dummy.head())
print(datos_dummy.dtypes)

acp = ACP(datos_dummy,n_componentes=3)
        Matematicas  Ciencias  Espanol  Historia  EdFisica Genero  Conducta
Lucia           7.0       6.5      9.2       8.6       8.0      F         3
Pedro           7.5       9.4      7.3       7.0       7.0      M         2
Ines            7.6       9.2      8.0       8.0       7.5      F         2
Luis            5.0       6.5      6.5       7.0       9.0      M         1
Andres          6.0       6.0      7.8       8.9       7.3      M         2
(10, 7)
Matematicas    float64
Ciencias       float64
Espanol        float64
Historia       float64
EdFisica       float64
Genero          object
Conducta         int64
dtype: object
        Matematicas  Ciencias  Espanol  Historia  EdFisica Genero Conducta
Lucia           7.0       6.5      9.2       8.6       8.0      F    Buena
Pedro           7.5       9.4      7.3       7.0       7.0      M  Regular
Ines            7.6       9.2      8.0       8.0       7.5      F  Regular
Luis            5.0       6.5      6.5       7.0       9.0      M     Mala
Andres          6.0       6.0      7.8       8.9       7.3      M  Regular
Matematicas    float64
Ciencias       float64
Espanol        float64
Historia       float64
EdFisica       float64
Genero          object
Conducta        object
dtype: object
        Matematicas  Ciencias  Espanol  Historia  EdFisica  Genero_F  \
Lucia           7.0       6.5      9.2       8.6       8.0         1   
Pedro           7.5       9.4      7.3       7.0       7.0         0   
Ines            7.6       9.2      8.0       8.0       7.5         1   
Luis            5.0       6.5      6.5       7.0       9.0         0   
Andres          6.0       6.0      7.8       8.9       7.3         0   

        Genero_M  Conducta_Buena  Conducta_Mala  Conducta_Regular  
Lucia          0               1              0                 0  
Pedro          1               0              0                 1  
Ines           0               0              0                 1  
Luis           1               0              1                 0  
Andres         1               0              0                 1  
Matematicas         float64
Ciencias            float64
Espanol             float64
Historia            float64
EdFisica            float64
Genero_F              uint8
Genero_M              uint8
Conducta_Buena        uint8
Conducta_Mala         uint8
Conducta_Regular      uint8
dtype: object
In [63]:
# Plotea el Plano Principal
acp.plot_plano_principal()
In [64]:
# Plotea el círculo de correlación
acp.plot_circulo()
In [65]:
# Plotea la sobreposición plano-correlación
acp.plot_sobreposicion()